Introduction
The University of California, Berkeley and San Francisco have strategic relationships with core suppliers through which its students, faculty, and staff do routine business. The daily operations of UCSF and UC Berkeley are fueled by the constant procurement of goods and services off of an e-commerce system called BearBuy. It is through this system that suppliers are enabled to host product catalogs. The Procurement department is responsible for determining the impact of the suggested pricing. Through an analysis of the proposed catalogs, the University gains visibility into any future price increases or product removals and determines if the proposed catalog is acceptable to be purchased off of.
Currently, the analysis does not take into account products of which the university has never purchased. The simple assumption that past historical spend accurately reflects forecasted purchases allows for an opportunity in an improvement in the analysis.
Benefits of Using Python/PANDAS:
The routine review of proposed catalogs, the limits of Excel and time consumption of repeated manual analysis make an automated process that can analyze and tackle the large amount data extremeley valuable. Python, and more specifically, PANDAS, is an excellent tool to run this continued analysis of determining potential price increases from the suppliers.
PANDAS allows for the easy manipulation of dataframes and the content within. The analyst inputs the correct file location of the catalogs to be analyzed as well as notes the specific campus and supplier. The code uses this data to ouptput all of the necessary content for determining if a price file is appropriate into a running log. This not only yields instant visibility into the impact of a catalog, but continuously organizes the results chronologically for easy recall.
In [17]:
##Ac Version - 1/1/15
import pandas as pd
from pandas import read_table, read_csv, merge, Series
import numpy as np
#Variables
current = read_table('EPPENDOR_Organiza_V009_20141229171002496_001.txt',error_bad_lines = False)
proposed = read_table('EPPENDOR_Organiza_V010_20141229171011596_001.txt',error_bad_lines = False)
spend= read_csv('POData_Eppendorf.csv')
campus = 'UCSF'
supplier = 'Eppendorf'
#start fixing the data --
#The wording suppliers choose is different for each one, so it is necessary to make the Units of Measure standard
#Change full words into abreviations to now actually pair up package with PK and box/BX, case with CS
spend=spend.rename(columns={'Amount/UOM & UOM':'Packaging UOM','SKU/Catalog #':'Part Number'})
def clean_up(df):
df['Packaging UOM']=df['Packaging UOM'].str.replace('.*box.*','BX').str.replace('.*package.*','PK').str.replace('.*case.*','CS').str.replace('.*each.*','EA')
df['Packaging UOM'] = df['Packaging UOM'].astype('str').str[-2:]
df['PartUOM']=df.apply(lambda x:'%s,%s' % (x['Part Number'],x['Packaging UOM']),axis=1)
df['PartUOM']= df['PartUOM'].str.replace("-", "").str.replace(",","").str.replace("/","").str.replace("_","").str.replace(".","").str.replace("'","")
List=(spend, current, proposed)
for x in List:
clean_up(x)
#Pivot out the spend data to get summary data and fewer rows
spend_summary = pd.DataFrame(pd.pivot_table(spend,values='Quantity',rows=['PartUOM']))
spend_summary
Out[17]:
In [18]:
##Import modules that have certain functions necessary to complete the analysis
import pandas as pd
from pandas import DataFrame as df
from pandas import Series
from pandas import merge
import numpy as np
import csv
#These are your variables that are unique to your file settings
Current = pd.read_table('EPPENDOR_Organiza_V009_20141229171002496_001.txt',error_bad_lines = False)
Proposed = pd.read_table('EPPENDOR_Organiza_V010_20141229171011596_001.txt',error_bad_lines = False)
Spend= pd.read_csv('POData_Eppendorf.csv')
Campus= 'UCSF'
Supplier = 'Eppendorf'
#Suppliers specifiy Units of Measure differently:
# Possible to see a variety of formats: 1/PK, Pack, PK, package
#Using the last two UOM won't work if you have package instead of 1/PK
#The wording suppliers choose is different for each one, so it is necessary to make the Units of Measure standard
#Change full words into abreviations to now actually pair up package with PK and box/BX, case with CS
Spend=Spend.rename(columns={'Amount/UOM & UOM':'Packaging UOM','SKU/Catalog #':'Part Number'})
#In Amount/UOM & UOM the unit of measure might be listed as 1/EA. This won't match up if in the Price catalog it is listed as EA
#Change each UOM column to only the LAST two characters to ensure similarities in format:
# Create a new Column called PartUOM so we can compare appropriately between all three files
def clean_up(df):
df['Packaging UOM']=df['Packaging UOM'].str.replace('.*box.*','BX').str.replace('.*package.*','PK').str.replace('.*case.*','CS').str.replace('.*each.*','EA')
df['Packaging UOM'] = df['Packaging UOM'].astype('str').str[-2:]
df['PartUOM']=df.apply(lambda x:'%s,%s' % (x['Part Number'],x['Packaging UOM']),axis=1)
df['PartUOM']= df['PartUOM'].str.replace("-", "").str.replace(",","").str.replace("/","").str.replace("_","").str.replace(".","").str.replace("'","")
List=(Spend, Current, Proposed)
for x in List:
clean_up(x)
# Merge the Proposed Catalog with the Current Catalog
# Create a new dataframe with the specific columns necessary for the analysis
comparison = merge(Current, Proposed, left_on='PartUOM', right_on='PartUOM', how='inner', suffixes=('_x', '_y'))
comparison = comparison.loc[:,['PartUOM', 'Price_x' , 'Price_y']]
#In order to properly analyze the data, we only want SKUs where they are not blank. Otherwise there is no way to tell if they should have been in a catalog
#This line of code omits all SKUS that are blank
Spend = Spend[Spend['Part Number'] >0]
Spend = Spend[Spend['Part Number'].notnull()]
#Want to get rid of SKUs where the person entered N/A. However, we don't use .str.contains because N/A COULD be a part of the SKU/Part #
List = ['N.A.','N/A','NA','n/a','n.a.','na']
Spend = Spend[-Spend['Part Number'].str.contains('Quote')]
Spend = Spend[-Spend['Part Number'].str.contains('quote')]
Spend = Spend[-Spend['Part Number'].str.contains('see attach')]
#Using .isin will filter out those SKUs where it exactly equals N/A isntead of just contains it.
Spend = Spend[-Spend['Part Number'].isin(List)]
spend_summary = pd.DataFrame(pd.pivot_table(spend,values='Quantity',rows=['PartUOM']))
#Merge the two catalog comparisons with the historic spend based on PartUOM
#Merging on 'Left' means we want all values of the Spend and only the values of the comparison that match.
analysis = merge(Spend, comparison, left_on='PartUOM', right_on='PartUOM', how='left')
#Extract only the columns that are relevant to the analysis
analysis = analysis.loc[:,[ 'Part Number','PartUOM', 'UNSPSC','Manufacturer', 'Quantity', 'Unit Price', 'Extended Price', 'Price_x' , 'Price_y','Item Type']]
#rename Price_x to Current Price and Price_y to Proposed Price for easier analysis
analysis = analysis.rename(columns={'Price_x': 'Current Price', 'Price_y': 'Proposed Price'})
###########If you only want items that are SQ Hosted Product##########################################################################
##Hashtag this first
#analysis= analysis[analysis['Item Type'] != 'NonCatalog Product']
##If there are any prices removed in the catalogs, the vendors will either put 'Price Removed' in the Proposed catalog under Current or Proposed Prices
#For the rest of the analysis we don't want 'Price Removed' since this is a string and we can't do operations on the entire column
# We do want to know how many items were removed
## We can compute that now, and then later change the 'Price Removed' to 0 so we can do calculations
#Here we sum the number of times Price Removed shows up in either of two columns to get the number of deleted items
analysis['Proposed Price']=analysis['Proposed Price'].astype('str')
analysis['Current Price'] = analysis['Current Price'].astype('str')
removed = analysis['Proposed Price'].str.contains('Price Removed').sum()
removed2 = analysis['Current Price'].str.contains('Price Removed').sum()
#I put two variables because the vendor could have the Price removed in Current or Proposed Price
print "The number of deleted items is %s and %s" %(removed,removed2)
In [19]:
#Some Price files may include $ and , in the prices so this will result in the number not being a float
#We use the if analysis...dtype to specify that if the Series is an Object, then we need to remove the $ and , and convert to float.
#Before I didn't specify this and the code omitted all data if I put .str.replace('$') when there was actually no $; Now the new code makes it applicable to any data
# Need to first omit the symbols
#Then convert to float through .astype
if analysis['Extended Price'].dtype is np.dtype('O'):
analysis['Extended Price'] = analysis['Extended Price'].str.replace(",","").str.replace("$","").astype('float32')
#In the Current and/or Proposed Catalog, some prices may be labeled as Price Removed
#Get rid of the Price Removed because these are strings and we want the entire column to consist of floats
#Replace the Price Removed with 0
if analysis['Current Price'].dtype is np.dtype('O'):
analysis['Current Price']= analysis['Current Price'].replace(to_replace= 'Price Removed' , value= 0, inplace=False)
analysis['Current Price'] = analysis['Current Price'].str.replace(",","").str.replace("$","").astype('float32')
if analysis['Proposed Price'].dtype is np.dtype('O'):
analysis['Proposed Price']= analysis['Proposed Price'].replace(to_replace= 'Price Removed' , value= 0, inplace=False)
analysis['Proposed Price'] = analysis['Proposed Price'].str.replace("$", "").str.replace(",","").astype('float32')
#Append computed values at the end of the DataFrame that helps us in the final analysis
#Compute Ext. Current Price, Proposed Ext. Price, $ Difference and % Difference
analysis['Current Ext. Price']=analysis['Quantity']*analysis['Current Price'].astype('float32')
analysis['Proposed Ext. Price'] = analysis['Quantity']*analysis['Proposed Price'].astype('float32')
analysis['$ Difference'] = analysis['Proposed Ext. Price']-analysis['Current Ext. Price']
analysis['% Difference'] = analysis['$ Difference']/analysis['Current Ext. Price']
# Let's add a validity column
# Validity answers the question, how much the comparable spend purchased is reflected in the current and proposed catalogs
# Recall, we already took out part numbers that were null or mislabeled by the user.
# We don't want this data since it won't accurately reflect an actual product.
# A SKU that is blank or mislabeled shouldn't be included in the analysis
analysis['Valid'] = analysis['Current Price'].notnull() * analysis['Proposed Price'].notnull()
## Now we can multiply the Extended prices by the Validity column to get the valid spend, valid current and proposed ext. prices
analysis['Valid Spend'] = analysis['Extended Price'] * analysis['Valid']
analysis['Valid Current Ext. Price'] = analysis['Current Ext. Price'] * analysis['Valid']
analysis['Valid Proposed Ext. Price'] = analysis['Proposed Ext. Price'] * analysis['Valid']
#This locale allows us to convert floats into currency
import locale
locale.setlocale( locale.LC_ALL, '' )
##Sum up the column values to get a total price
Spend = analysis['Extended Price'].sum()
Valid_Spend = analysis['Valid Spend'].sum()
# We can get the Validity percentage
Validity_percentage = Valid_Spend / Spend
valid_percentage = '{percent:.2%}'.format(percent= Validity_percentage)
#Sum up Current and Proposed Extended Prices
Valid_CurrExt_Price = analysis['Valid Current Ext. Price'].sum()
Valid_ProposedExt_Price = analysis['Valid Proposed Ext. Price'].sum()
Total_Catalog_Price_Difference = Valid_ProposedExt_Price - Valid_CurrExt_Price
Percent_Increase = Total_Catalog_Price_Difference/Valid_CurrExt_Price
#Understand the count of different SKU's purchased
Count_SKUS = len(analysis['Part Number'].unique())
#Get an understanding for the total number of products purchased
Sum_of_SKUS= analysis['Quantity'].sum()
#set these variables equal to currency so the end result is easier to read
spenddollars = locale.currency(Spend)
valid_spend_dollars = locale.currency(Valid_Spend)
current_price_dollars = locale.currency (Valid_CurrExt_Price)
proposed_price_dollars = locale.currency(Valid_ProposedExt_Price)
price_difference_dollars = locale.currency(Total_Catalog_Price_Difference)
#Now we want the percentage to actually look like a percentage
#Ex. .0007 will simply yield 0 if we don't format it properly
percentage = '{percent:.2%}'.format(percent=Percent_Increase)
#Notice how the variables used are %s not %d. The locale.currency function changes the float into a string.
print "The Total spend is %s and the valid spend is %s" %(spenddollars,valid_spend_dollars)
print "Validity percentage is %s \n" %valid_percentage
print "The Total Quantity of SKUS analyzed is %d" %Sum_of_SKUS
print "The number of SKUS analyzed is %d\n" %Count_SKUS
print "The Total Valid Current Extended price is %s" %current_price_dollars
print "The Total Valid Proposed Extended price is %s\n\n" %proposed_price_dollars
print "The dollar difference between Total Proposed and Total Current is %s" %price_difference_dollars
print "The percent increase is %s" %percentage
In [22]:
analysis.to_csv('../Desktop/eppen.csv')
In [65]:
#Let's now get the spend for any products that were removed
#We can create a separate dataframe where the ProposedPrice is 0 AND there exists a Current Price
#First we need to fill the Proposed Prices to 0 if it is empty
analysis['Proposed Price'] = analysis['Proposed Price'].fillna(0)
#After filling in the empty spaces with 0 we create a dataframe where the Proposed Price is ONLY zero
proposed_0= analysis[analysis['Proposed Price'] == 0]
#We don't want both Current and Proposed to be zero
#If there exists a current price and not a proposed then it shows that a price was removed
#Create a dataframe where Proposed Price is 0 and Current price is some number
currentnot_zero = proposed_0[proposed_0['Current Price'] >0]
#This variable will show the aggregated historic spend on the products that were removed
Removed_Spend = currentnot_zero['Extended Price'].sum()
#Let's put this value in currency notation
Removed_Spend_Dollars = locale.currency(Removed_Spend)
print "The Removed Spend is %s" %Removed_Spend_Dollars
In [42]:
analysis.to_csv('C:/Users/alexisperez/Documents/Catalogs for Analysis/Beckman/Analysis.csv')
In [36]:
#Create a pivot table to look at top UNSPSC's and specific numbers associated with those product categories
from pandas import pivot_table
import numpy as np
UNSPSC = pivot_table( analysis, values = ['Extended Price', 'Quantity', '$ Difference','Valid Current Ext. Price','Valid Proposed Ext. Price'], rows = ['UNSPSC'], aggfunc = np.sum)
UNSPSC['Percent Increase'] = UNSPSC['$ Difference'] / UNSPSC['Valid Current Ext. Price']
UNSPSC['Percent Increase']= UNSPSC['Percent Increase']*100
#Want to show the pivot table with the top UNSPSC's by spend
#Use .head() because some pivot tables will be too large to show all of the columns and we only care about the top 5 anyway
UNSPSC=UNSPSC.rename(columns={'Extended Price':'Spend'})
UNSPSC.sort("Spend", ascending=False).head()
In [38]:
#Create a pivot table to look at the summary of the proposed catalog impact by Manufacturer
Manufacturer = pivot_table( analysis, values = ['Extended Price', 'Quantity', '$ Difference','Valid Current Ext. Price','Valid Proposed Ext. Price'], rows = 'Manufacturer', aggfunc = np.sum)
Manufacturer['Percent Increase'] = Manufacturer['$ Difference'] / Manufacturer['Valid Current Ext. Price']
Manufacturer['Percent Increase']= Manufacturer['Percent Increase']*100
Manufacturer= Manufacturer[Manufacturer['$ Difference'].notnull()]
Manufacturer=Manufacturer.rename(columns={'Extended Price':'Spend'})
Manufacturer.sort("Spend", ascending=False)[:6]
Out[38]:
In [39]:
import datetime as dt
from datetime import datetime
from datetime import date
date = dt.date.today().strftime("%m/%d/%Y")
Summary= df([dict(Supplier=Supplier,Spend=spenddollars,Validity_Percent=valid_percentage,Price_Difference=price_difference_dollars,Percentage_of_Increase=percentage,Quantity_Removed=removed,Quantity_Removed2=removed2,Removed_Spend = Removed_Spend_Dollars, Campus = Campus, Date = date),])
Log = pd.read_csv('C:/Users/alexisperez/Documents/Log of Price Files/Price File Log.csv')
Log = Log.append(Summary)
Log.to_csv('C:/Users/alexisperez/Documents/Log of Price Files/Price File Log.csv', index = False)
In [1]:
#Update the Large CSV file of all catalogs
#Note we do have CAMPUS and SUPPLIER
import pandas as pd
from pandas import DataFrame
from pandas import read_table
import os
import datetime as dt
from datetime import datetime
from datetime import date
import fuzzy
from fuzzy import *
dmeta = fuzzy.DMetaphone()
today = dt.date.today()
Campus='UCSF'
catalog_dir = "C:/Users/alexisperez/Documents/Catalog Data/New Files/"
catalog_path = os.path.join(catalog_dir, Campus, "%s_Titles.txt" % Campus)
catalog_titles = read_table(catalog_path)
def create_uniqueID(df):
df['DMeta Manf']=df['Manufacturer Name'].astype('str').apply(lambda x: dmeta(x))
df['DMeta Manf Str']=df['DMeta Manf'].apply(lambda x:x[0] if x[1] is None else x[0] + x[1])
df['UOM Edited']=df['Packaging UOM'].str.replace('.*box.*','BX').str.replace('.*package.*','PK').str.replace('.*case.*','CS').str.replace('.*each.*','EA')
df['UOM Edited'] = df['UOM Edited'].astype('str').str[-2:]
df['Manufacturer Part Number Edited']=df['Manufacturer Part Number'].str.replace("-", "").str.replace(",","").str.replace("/","").str.replace("_","").str.replace(".","").str.replace("'","")
df['Unique ID'] = df.apply(lambda x: '%s,%s,%s' % (x['DMeta Manf Str'],x['Manufacturer Part Number Edited'],x['UOM Edited']),axis=1)
for supplier in catalog_titles['Titles']:
new_catalog = read_table(supplier)
new_catalog['Supplier'] = supplier
new_catalog['Campus/UCOP'] = Campus
new_catalog['Date Uploaded'] = today
create_uniqueID(new_catalog)
#All_files=pd.read_csv('C:/Users/alexisperez/Catalog Data/All files with Unique ID')
#All_files.append(Proposed)
In [2]:
All_files=pd.read_csv('C:/Users/alexisperez/Documents/Catalog Data/All files with Unique ID.csv')
In [ ]:
Replace= DataFrame()
for x in All_files['Campus/UCOP']:
if x == Campus:
Replace=All_files[-All_files['Supplier'].str.contains('HENRY')]
else:
Replace = All_files
Replace
In [ ]: